# загрузим библиотеки для чтения данных
import pandas as pd
import numpy as np
!ls
first notebook.ipynb test_main_data.csv train_main_data.csv test_additional_data.csv train_additional_data.csv
# Read the four CSV splits into their own dataframes.
_frames = {
    name: pd.read_csv(f'{name}_data.csv')
    for name in ('train_main', 'train_additional', 'test_main', 'test_additional')
}
train_main_df = _frames['train_main']
train_additional_df = _frames['train_additional']
test_main_df = _frames['test_main']
test_additional_df = _frames['test_additional']
# Dimensions of the main training set.
train_main_df.shape
(29000, 13)
train_main_df.head()
| id | timestamp | full_sq | life_sq | floor | max_floor | material | build_year | num_room | kitch_sq | apartment condition | sub_area | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 81237 | 2014-09-23 | 50 | 28.0 | 11.0 | 12.0 | 1.0 | 1984.0 | 2.0 | 8.0 | NaN | 77 | 8.908240e+06 |
| 1 | 96765 | 2014-07-31 | 42 | 28.0 | 1.0 | 5.0 | 2.0 | 1961.0 | 2.0 | 6.0 | 2.0 | 15 | 4.616088e+06 |
| 2 | 64804 | 2015-06-11 | 62 | NaN | 9.0 | 17.0 | 1.0 | NaN | 2.0 | 0.0 | NaN | 103 | 5.826177e+06 |
| 3 | 27439 | 2013-11-17 | 45 | 45.0 | 9.0 | 25.0 | 1.0 | 1.0 | 3.0 | 1.0 | 1.0 | 102 | 4.053249e+06 |
| 4 | 17258 | 2014-02-10 | 57 | 38.0 | 2.0 | 9.0 | 1.0 | 1968.0 | 3.0 | 6.0 | 3.0 | 15 | 2.024600e+06 |
train_additional_df.head()
| id | population | indust_part | preschool_facilities | school_facilities | hospital_beds_raion | healthcare_facilities | university_num | sport_objects_facilities | additional_education_facilities | culture_objects_facilities | shopping_centers_facilities | office_num | green_part | prom_part | cafe_count | church_facilities | mosque | leisure_facilities | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 81237 | 78616 | 0.093443 | 6 | 8 | 3300.0 | 2 | 1 | 11 | 1 | 0 | 4 | 5 | 16.06 | 2.68 | 2 | 0 | 1 | 0 |
| 1 | 96765 | 125354 | 0.265089 | 4 | 5 | 1937.0 | 3 | 0 | 6 | 4 | 0 | 2 | 0 | 3.24 | 0.00 | 2 | 0 | 0 | 0 |
| 2 | 64804 | 4001 | 0.007122 | 0 | 0 | NaN | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 3.33 | 3.70 | 2 | 0 | 0 | 0 |
| 3 | 27439 | 9553 | 0.072158 | 0 | 0 | NaN | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0.00 | 14.00 | 0 | 1 | 0 | 0 |
| 4 | 17258 | 125354 | 0.265089 | 4 | 5 | 1937.0 | 3 | 0 | 6 | 4 | 0 | 2 | 0 | 6.64 | 0.00 | 1 | 0 | 0 | 0 |
train_additional_df.shape
(29000, 19)
Рекомендуемые этапы анализа данных
# Look at the columns, null counts and dtypes
train_main_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 29000 entries, 0 to 28999 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 29000 non-null int64 1 timestamp 29000 non-null object 2 full_sq 29000 non-null int64 3 life_sq 22924 non-null float64 4 floor 28839 non-null float64 5 max_floor 19881 non-null float64 6 material 19881 non-null float64 7 build_year 16051 non-null float64 8 num_room 19881 non-null float64 9 kitch_sq 19881 non-null float64 10 apartment condition 16096 non-null float64 11 sub_area 29000 non-null int64 12 price 29000 non-null float64 dtypes: float64(9), int64(3), object(1) memory usage: 2.9+ MB
# Summary statistics for the numeric columns
train_main_df.describe()
| id | full_sq | life_sq | floor | max_floor | material | build_year | num_room | kitch_sq | apartment condition | sub_area | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 29000.000000 | 29000.000000 | 22924.000000 | 28839.000000 | 19881.000000 | 19881.000000 | 1.605100e+04 | 19881.000000 | 19881.000000 | 16096.000000 | 29000.000000 | 2.900000e+04 |
| mean | 55032.038069 | 54.228000 | 34.418644 | 7.674954 | 12.536744 | 1.825210 | 3.127933e+03 | 1.910920 | 6.415070 | 2.106921 | 76.503172 | 5.772016e+06 |
| std | 25909.100401 | 38.553717 | 53.387672 | 5.319167 | 6.767270 | 1.478001 | 1.582588e+05 | 0.853365 | 28.953382 | 0.882152 | 39.199563 | 3.867065e+06 |
| min | 10017.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 8.098400e+04 |
| 25% | 32786.750000 | 38.000000 | 20.000000 | 3.000000 | 9.000000 | 1.000000 | 1.966000e+03 | 1.000000 | 1.000000 | 1.000000 | 45.000000 | 3.838643e+06 |
| 50% | 55005.000000 | 49.000000 | 30.000000 | 7.000000 | 12.000000 | 1.000000 | 1.979000e+03 | 2.000000 | 6.000000 | 2.000000 | 77.000000 | 5.088820e+06 |
| 75% | 77421.250000 | 63.000000 | 43.000000 | 11.000000 | 17.000000 | 2.000000 | 2.005000e+03 | 2.000000 | 9.000000 | 3.000000 | 105.000000 | 6.721672e+06 |
| max | 99993.000000 | 5326.000000 | 7478.000000 | 77.000000 | 117.000000 | 6.000000 | 2.005201e+07 | 19.000000 | 2014.000000 | 33.000000 | 145.000000 | 8.998222e+07 |
# Convert the timestamp column from string to datetime
train_main_df['timestamp'] = pd.to_datetime(train_main_df['timestamp'])
# импортируем библиотеки для визуализаци данных
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
# Scatter plot: sale price against transaction date.
fig = plt.figure(figsize=(12, 6))
ax = fig.gca()
ax.scatter(train_main_df.timestamp, train_main_df.price)
ax.set_xlabel('timestamp', fontsize=12)
ax.set_ylabel('price', fontsize=12)
plt.show()
# Plot the price distribution.
# Fix: `sns.distplot` is deprecated (see the FutureWarning below);
# `histplot` with kde=True and stat='density' reproduces the same figure.
plt.figure(figsize=(12, 8))
sns.histplot(train_main_df.price.values, bins=50, kde=True, stat='density')
plt.xlabel('price')
plt.show()
/Users/iMatvich/anaconda3/envs/main/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
# Pairwise scatter plots for all numeric columns;
# the bottom row shows each feature against price.
sns.pairplot(train_main_df)
<seaborn.axisgrid.PairGrid at 0x7fbbbc7794d0>
# Boxplots of price per number of rooms.
# Median price grows with room count, but the data clearly has issues:
# a flat with 0 rooms, and cheap flats with many rooms.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='num_room', y='price', data=train_main_df, ax=ax)
ax.set_ylabel('price', fontsize=12)
ax.set_xlabel('num_room', fontsize=12)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Boxplot of price vs. number of floors in the building.
# Rough grouping: cheap 5-storey buildings, floors 6-25, floors 26-32.
# Note the outlier: the maximum price is in a 4-storey building.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='max_floor', y='price', data=train_main_df, ax=ax)
ax.set_ylabel('price', fontsize=12)
ax.set_xlabel('max_floor', fontsize=12)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Price vs. the flat's own floor.
# The median changes little across floors;
# the 12th-floor outlier deserves a closer look.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='floor', y='price', data=train_main_df, ax=ax)
ax.set_ylabel('price', fontsize=12)
ax.set_xlabel('floor', fontsize=12)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Price vs. total area — values are widely scattered.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='full_sq', y='price', data=train_main_df, ax=ax)
ax.set_ylabel('price', fontsize=12)
ax.set_xlabel('full_sq', fontsize=12)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Distribution of total area below 1000:
# most values are under 100, followed by a long tail.
train_main_df.loc[train_main_df.full_sq < 1000, 'full_sq'].hist(bins=50)
<AxesSubplot:>
# Bucket full_sq into coarse bins; the long tail falls into the last bin.
bin_edges = [0, 30, 40, 50, 60, 70, 80, 90, 100, 200, 5326]
train_main_df['full_sq_bins'] = np.searchsorted(bin_edges, train_main_df.full_sq.values)
# Price per area bin from the previous step.
# Outliers in bins 4 and 9 are worth separate inspection.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='full_sq_bins', y='price', data=train_main_df, ax=ax)
ax.set_ylabel('price', fontsize=12)
ax.set_xlabel('full_sq_bins', fontsize=12)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Join the district-level features onto the main training data by id
whole_train_df = train_main_df.merge(train_additional_df, how='left', on='id')
# Pairwise plots again, now including the merged columns;
# note the new discrete-valued features plotted against price.
sns.pairplot(whole_train_df)
<seaborn.axisgrid.PairGrid at 0x7fbbaa7ed950>
# Price vs. number of nearby cultural objects.
# Possibly worth inspecting the rows with culture_objects_facilities == 10.
plt.figure(figsize=(12, 8))
sns.boxplot(x='culture_objects_facilities', y='price', data=whole_train_df)
plt.ylabel('price', fontsize=12)
# Fix: the axis label previously read 'cultural_objects_facilities',
# which does not match the actual column name used on the x axis.
plt.xlabel('culture_objects_facilities', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
# Price vs. number of healthcare facilities — the median barely moves.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='healthcare_facilities', y='price', data=whole_train_df, ax=ax)
ax.set_ylabel('price', fontsize=12)
ax.set_xlabel('healthcare_facilities', fontsize=12)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Price vs. number of nearby universities —
# flats near 1 or 3 universities are more expensive.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='university_num', y='price', data=whole_train_df, ax=ax)
ax.set_ylabel('price', fontsize=12)
ax.set_xlabel('university_num', fontsize=12)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
# Count missing values per column and chart the columns that have any.
missing_df = whole_train_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
mask = missing_df['missing_count'] > 0
missing_df = missing_df[mask].sort_values(by='missing_count')
positions = range(missing_df.shape[0])
fig, ax = plt.subplots(figsize=(12, 18))
ax.barh(positions, missing_df['missing_count'], color="blue")
ax.set_yticks(positions)
ax.set_yticklabels(missing_df.column_name.values, rotation='horizontal')
ax.set_xlabel("Count of missing values")
ax.set_title("Number of missing values in each column")
plt.show()
from IPython.display import display, Math, Latex
# Render the common regression metrics.
# Fixes: "MАE" contained a Cyrillic 'А' instead of a Latin 'A'; the '%'
# in MAPE must be escaped ('\%'), otherwise LaTeX treats the rest of the
# line as a comment. 'MSE(ocm)' was garbled — presumably the baseline
# (mean-prediction) model; rendered as MSE(mean). TODO confirm intent.
display(Math(r'MSE = \frac1N \sum ^{N}_{i=1} (y_i-\hat y_i)^2 '))
display(Math(r'RMSE = \sqrt{\frac1N \sum ^{N}_{i=1} (y_i-\hat y_i)^2 } = \sqrt {MSE}'))
display(Math(r'MAE = \frac1N \sum ^{N}_{i=1} |y_i-\hat y_i|'))
display(Math(r'R^2 = 1 - \frac{MSE(model)}{MSE(mean)} '))
display(Math(r'MAPE = \frac{100\%}{N} \sum ^{N}_{i=1} |\frac{y_i-\hat y_i}{y_i}| '))
display(Math(r'RMSLE = \sqrt{MSE(log(y_i+1),log(\hat y_i +1))}'))
train_main_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 29000 entries, 0 to 28999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 29000 non-null int64 1 timestamp 29000 non-null datetime64[ns] 2 full_sq 29000 non-null int64 3 life_sq 22924 non-null float64 4 floor 28839 non-null float64 5 max_floor 19881 non-null float64 6 material 19881 non-null float64 7 build_year 16051 non-null float64 8 num_room 19881 non-null float64 9 kitch_sq 19881 non-null float64 10 apartment condition 16096 non-null float64 11 sub_area 29000 non-null int64 12 price 29000 non-null float64 13 full_sq_bins 29000 non-null int64 dtypes: datetime64[ns](1), float64(9), int64(4) memory usage: 3.1 MB
whole_train_df
| id | timestamp | full_sq | life_sq | floor | max_floor | material | build_year | num_room | kitch_sq | ... | additional_education_facilities | culture_objects_facilities | shopping_centers_facilities | office_num | green_part | prom_part | cafe_count | church_facilities | mosque | leisure_facilities | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 81237 | 2014-09-23 | 50 | 28.0 | 11.0 | 12.0 | 1.0 | 1984.0 | 2.0 | 8.0 | ... | 1 | 0 | 4 | 5 | 16.06 | 2.68 | 2 | 0 | 1 | 0 |
| 1 | 96765 | 2014-07-31 | 42 | 28.0 | 1.0 | 5.0 | 2.0 | 1961.0 | 2.0 | 6.0 | ... | 4 | 0 | 2 | 0 | 3.24 | 0.00 | 2 | 0 | 0 | 0 |
| 2 | 64804 | 2015-06-11 | 62 | NaN | 9.0 | 17.0 | 1.0 | NaN | 2.0 | 0.0 | ... | 0 | 0 | 1 | 0 | 3.33 | 3.70 | 2 | 0 | 0 | 0 |
| 3 | 27439 | 2013-11-17 | 45 | 45.0 | 9.0 | 25.0 | 1.0 | 1.0 | 3.0 | 1.0 | ... | 0 | 0 | 0 | 1 | 0.00 | 14.00 | 0 | 1 | 0 | 0 |
| 4 | 17258 | 2014-02-10 | 57 | 38.0 | 2.0 | 9.0 | 1.0 | 1968.0 | 3.0 | 6.0 | ... | 4 | 0 | 2 | 0 | 6.64 | 0.00 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28995 | 96637 | 2014-09-13 | 65 | 64.0 | 6.0 | 18.0 | 2.0 | 2017.0 | 3.0 | 1.0 | ... | 0 | 0 | 0 | 0 | 16.86 | 2.72 | 0 | 0 | 0 | 0 |
| 28996 | 91710 | 2014-04-11 | 42 | 26.0 | 4.0 | 5.0 | 1.0 | 1963.0 | 2.0 | 5.0 | ... | 0 | 0 | 0 | 1 | 0.00 | 18.07 | 0 | 0 | 0 | 0 |
| 28997 | 71492 | 2014-02-13 | 44 | 27.0 | 7.0 | 9.0 | 1.0 | 1968.0 | 2.0 | 5.0 | ... | 3 | 0 | 10 | 11 | 1.11 | 3.82 | 2 | 0 | 0 | 0 |
| 28998 | 41071 | 2014-02-13 | 50 | 29.0 | 4.0 | 5.0 | 2.0 | 1956.0 | 2.0 | 7.0 | ... | 1 | 0 | 4 | 3 | 2.18 | 0.00 | 0 | 0 | 0 | 0 |
| 28999 | 41674 | 2013-09-17 | 42 | 26.0 | 2.0 | 5.0 | 5.0 | 1961.0 | 2.0 | 5.0 | ... | 4 | 0 | 2 | 4 | 7.43 | 0.00 | 0 | 1 | 0 | 0 |
29000 rows × 32 columns
# Derive calendar features from the transaction date.
# (Fixes: the "day of week" / "week of year" comments were swapped;
# Series.dt.weekofyear is deprecated — use isocalendar().week, which
# returns UInt32, cast to int64 for a consistent dtype; casting a
# datetime Series with astype(int) is deprecated — use 'int64'.)
# get year
whole_train_df['year'] = whole_train_df.timestamp.dt.year
# get month of year
whole_train_df['month'] = whole_train_df.timestamp.dt.month
# get week of the year
whole_train_df['week_of_year'] = whole_train_df.timestamp.dt.isocalendar().week.astype('int64')
# get day of week
whole_train_df['day_of_week'] = whole_train_df.timestamp.dt.weekday
# nanosecond epoch as an integer — a monotone time-trend feature
whole_train_df['timestamp_int'] = whole_train_df.timestamp.astype('int64')
<ipython-input-34-d5956410835a>:10: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead. whole_train_df['week_of_year'] = whole_train_df.timestamp.dt.weekofyear
# Build a 'YYYY_MM' period label for monthly grouping
# (identical output to str(year) + '_' + zero-padded month).
whole_train_df['year_month'] = whole_train_df.timestamp.dt.strftime('%Y_%m')
# Fill every remaining NaN with the sentinel constant -99.
# NOTE(review): one sentinel for all columns is crude — tree models
# tolerate it, but a per-column imputation strategy may work better.
whole_train_df.fillna(-99, inplace=True)
# Price depends strongly on area, so add area-ratio features.
whole_train_df["ratio_life_dash_full_sq"] = whole_train_df["life_sq"] / whole_train_df["full_sq"]
whole_train_df["ration_kitchen_dash_full_sq"] = whole_train_df["kitch_sq"] / whole_train_df["full_sq"]
# Fix: full_sq can be 0 (see describe() above), which makes the ratios
# +/-inf; convert those to NaN so they count as missing rather than as
# extreme numeric values.
for ratio_col in ("ratio_life_dash_full_sq", "ration_kitchen_dash_full_sq"):
    whole_train_df[ratio_col] = whole_train_df[ratio_col].replace([np.inf, -np.inf], np.nan)
# Building age at the time of sale.
# NOTE(review): build_year contains bogus values (0, 1, 2.005e7) plus the
# -99 sentinel, so 'age' inherits that noise — worth cleaning upstream.
whole_train_df['age'] = whole_train_df["build_year"] - whole_train_df['year']
# Difference between total and living area.
whole_train_df['some_extra_sqr'] = whole_train_df["full_sq"] - whole_train_df["life_sq"]
whole_train_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 29000 entries, 0 to 28999 Data columns (total 42 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 29000 non-null int64 1 timestamp 29000 non-null datetime64[ns] 2 full_sq 29000 non-null int64 3 life_sq 29000 non-null float64 4 floor 29000 non-null float64 5 max_floor 29000 non-null float64 6 material 29000 non-null float64 7 build_year 29000 non-null float64 8 num_room 29000 non-null float64 9 kitch_sq 29000 non-null float64 10 apartment condition 29000 non-null float64 11 sub_area 29000 non-null int64 12 price 29000 non-null float64 13 full_sq_bins 29000 non-null int64 14 population 29000 non-null int64 15 indust_part 29000 non-null float64 16 preschool_facilities 29000 non-null int64 17 school_facilities 29000 non-null int64 18 hospital_beds_raion 29000 non-null float64 19 healthcare_facilities 29000 non-null int64 20 university_num 29000 non-null int64 21 sport_objects_facilities 29000 non-null int64 22 additional_education_facilities 29000 non-null int64 23 culture_objects_facilities 29000 non-null int64 24 shopping_centers_facilities 29000 non-null int64 25 office_num 29000 non-null int64 26 green_part 29000 non-null float64 27 prom_part 29000 non-null float64 28 cafe_count 29000 non-null int64 29 church_facilities 29000 non-null int64 30 mosque 29000 non-null int64 31 leisure_facilities 29000 non-null int64 32 year 29000 non-null int64 33 month 29000 non-null int64 34 week_of_year 29000 non-null int64 35 day_of_week 29000 non-null int64 36 timestamp_int 29000 non-null int64 37 year_month 29000 non-null object 38 ratio_life_dash_full_sq 28999 non-null float64 39 ration_kitchen_dash_full_sq 28998 non-null float64 40 age 29000 non-null float64 41 some_extra_sqr 29000 non-null float64 dtypes: datetime64[ns](1), float64(17), int64(23), object(1) memory usage: 10.8+ MB
# Convert the test-set timestamp column to datetime as well.
test_main_df.timestamp = pd.to_datetime(test_main_df.timestamp)
# datetime_is_numeric=True opts into the numeric-style datetime summary
# and silences the pandas FutureWarning (in pandas >= 2.0 this keyword
# was removed because it became the default — drop it there).
test_main_df.timestamp.describe(datetime_is_numeric=True)
<ipython-input-40-a909b6a94011>:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now. test_main_df.timestamp.describe()
count 1000 unique 682 top 2014-12-21 00:00:00 freq 5 first 2011-11-01 00:00:00 last 2015-07-21 00:00:00 Name: timestamp, dtype: object
# Same summary for the training timestamps; datetime_is_numeric=True
# silences the FutureWarning (keyword removed in pandas >= 2.0).
whole_train_df.timestamp.describe(datetime_is_numeric=True)
<ipython-input-41-10307d096943>:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now. whole_train_df.timestamp.describe()
count 29000 unique 1416 top 2014-12-19 00:00:00 freq 68 first 2011-09-18 00:00:00 last 2015-08-10 00:00:00 Name: timestamp, dtype: object
# Price per calendar month, in chronological order.
fig, ax = plt.subplots(figsize=(12, 8))
monthly = whole_train_df.sort_values(by='year_month', ascending=True)
sns.boxplot(x='year_month', y='price', data=monthly, ax=ax)
ax.set_ylabel('price', fontsize=12)
ax.set_xlabel('year_month', fontsize=12)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
import xgboost as xgb
# XGBoost hyper-parameters.
# Fixes: 'reg:linear' was renamed to 'reg:squarederror' (same objective,
# see the deprecation warning below); the legacy 'silent' flag is ignored
# by modern XGBoost — 'verbosity' replaces it.
xgb_params = {
    'eta': 0.05,               # learning rate
    'max_depth': 4,
    'subsample': 0.7,          # row sampling per tree
    'colsample_bytree': 0.7,   # feature sampling per tree
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'min_child_weight': 1,
    'verbosity': 0,
    'seed': 0
}
# Time-based split: validate on sales on/after 2015-05-01, train on
# earlier data. (Fixes: the cut-off date is computed once; train_X now
# drops 'price'/'timestamp' like val_X did — downstream code only uses
# train_X[col_list], so behavior is unchanged but the frames are
# consistent and the target no longer sits inside the feature frame.)
split_date = pd.to_datetime('2015-05-01')
train_mask = whole_train_df.timestamp < split_date
train_X = whole_train_df.loc[train_mask].drop(['price', 'timestamp'], axis=1)
train_y = whole_train_df.loc[train_mask, 'price']
val_X = whole_train_df.loc[~train_mask].drop(['price', 'timestamp'], axis=1)
val_y = whole_train_df.loc[~train_mask, 'price']
# Features fed to the model: everything except id, price, timestamp and
# the string year_month label.
col_list = [
    'full_sq', 'life_sq', 'floor', 'max_floor', 'material', 'build_year',
    'num_room', 'kitch_sq', 'apartment condition', 'sub_area', 'full_sq_bins',
    'population', 'indust_part', 'preschool_facilities', 'school_facilities',
    'hospital_beds_raion', 'healthcare_facilities', 'university_num',
    'sport_objects_facilities', 'additional_education_facilities',
    'culture_objects_facilities', 'shopping_centers_facilities', 'office_num',
    'green_part', 'prom_part', 'cafe_count', 'church_facilities', 'mosque',
    'leisure_facilities', 'year', 'month', 'week_of_year', 'day_of_week',
    'timestamp_int', 'ratio_life_dash_full_sq', 'ration_kitchen_dash_full_sq',
    'age', 'some_extra_sqr',
]
# Wrap the time-split train / validation data and fit the first model.
xgb_train = xgb.DMatrix(train_X[col_list], train_y,
                        feature_names=col_list, enable_categorical=True)
xgb_test = xgb.DMatrix(val_X[col_list], val_y,
                       feature_names=col_list, enable_categorical=True)
evallist = [(xgb_test, 'eval'), (xgb_train, 'train')]
model = xgb.train(
    params=xgb_params,
    dtrain=xgb_train,
    num_boost_round=300,
    evals=evallist,
    early_stopping_rounds=10,
    verbose_eval=10,
)
[23:20:08] WARNING: /Users/travis/build/dmlc/xgboost/src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
[23:20:08] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:541:
Parameters: { silent } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] eval-rmse:7476762.00000 train-rmse:6610620.50000
[10] eval-rmse:5179562.50000 train-rmse:4468374.00000
[20] eval-rmse:3929113.75000 train-rmse:3345265.00000
[30] eval-rmse:3226784.50000 train-rmse:2763592.00000
[40] eval-rmse:2873099.00000 train-rmse:2490896.50000
[50] eval-rmse:2666433.50000 train-rmse:2345573.50000
[60] eval-rmse:2526168.00000 train-rmse:2259500.25000
[70] eval-rmse:2460093.75000 train-rmse:2204568.75000
[80] eval-rmse:2395885.50000 train-rmse:2163670.00000
[90] eval-rmse:2359611.25000 train-rmse:2133465.75000
[100] eval-rmse:2334044.75000 train-rmse:2109482.50000
[110] eval-rmse:2313466.25000 train-rmse:2087167.25000
[120] eval-rmse:2298799.00000 train-rmse:2068287.12500
[130] eval-rmse:2289180.75000 train-rmse:2049894.87500
[140] eval-rmse:2269619.00000 train-rmse:2035080.00000
[150] eval-rmse:2256844.75000 train-rmse:2017779.75000
[160] eval-rmse:2243176.75000 train-rmse:2001272.87500
[170] eval-rmse:2236593.25000 train-rmse:1988067.12500
[180] eval-rmse:2231386.25000 train-rmse:1976977.87500
[190] eval-rmse:2223544.50000 train-rmse:1964085.00000
[200] eval-rmse:2215509.75000 train-rmse:1950872.62500
[210] eval-rmse:2209716.25000 train-rmse:1939384.00000
[220] eval-rmse:2207400.75000 train-rmse:1931920.00000
[230] eval-rmse:2199996.25000 train-rmse:1923101.12500
[240] eval-rmse:2190625.75000 train-rmse:1913317.75000
[250] eval-rmse:2186002.75000 train-rmse:1905027.87500
[260] eval-rmse:2178664.25000 train-rmse:1897488.25000
[270] eval-rmse:2169142.50000 train-rmse:1890111.25000
[280] eval-rmse:2164209.00000 train-rmse:1883142.75000
[290] eval-rmse:2162188.25000 train-rmse:1875855.62500
[299] eval-rmse:2158237.50000 train-rmse:1869086.37500
from sklearn.model_selection import train_test_split
# Random hold-out instead of the time-based split
# (same validation size as before: 1425 rows).
features = whole_train_df[col_list]
target = whole_train_df.price
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=1425, random_state=42)
# Same hyper-parameters for the random-split model.
# Fixes: 'reg:linear' -> 'reg:squarederror' (renamed, same objective);
# the ignored 'silent' flag replaced by 'verbosity'.
xgb_params = {
    'eta': 0.05,
    'max_depth': 4,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'min_child_weight': 1,
    'verbosity': 0,
    'seed': 0
}
# Fit the second model on the random split, watching both sets.
xgb_train = xgb.DMatrix(X_train, y_train,
                        feature_names=col_list, enable_categorical=True)
xgb_test = xgb.DMatrix(X_test, y_test,
                       feature_names=col_list, enable_categorical=True)
evallist = [(xgb_test, 'eval'), (xgb_train, 'train')]
model_2 = xgb.train(
    params=xgb_params,
    dtrain=xgb_train,
    num_boost_round=300,
    evals=evallist,
    early_stopping_rounds=10,
    verbose_eval=10,
)
[23:20:43] WARNING: /Users/travis/build/dmlc/xgboost/src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
[23:20:43] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:541:
Parameters: { silent } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] eval-rmse:7185233.50000 train-rmse:6625482.50000
[10] eval-rmse:5049912.00000 train-rmse:4473881.50000
[20] eval-rmse:3929034.00000 train-rmse:3344742.25000
[30] eval-rmse:3342730.75000 train-rmse:2763428.25000
[40] eval-rmse:3051133.75000 train-rmse:2482626.75000
[50] eval-rmse:2887214.75000 train-rmse:2328527.50000
[60] eval-rmse:2756966.25000 train-rmse:2244728.75000
[70] eval-rmse:2692968.00000 train-rmse:2194242.00000
[80] eval-rmse:2638506.75000 train-rmse:2154196.75000
[90] eval-rmse:2591377.50000 train-rmse:2123792.25000
[100] eval-rmse:2568987.75000 train-rmse:2099208.50000
[110] eval-rmse:2545619.25000 train-rmse:2079226.00000
[120] eval-rmse:2523972.75000 train-rmse:2058409.00000
[130] eval-rmse:2499134.75000 train-rmse:2037765.12500
[140] eval-rmse:2481549.25000 train-rmse:2020302.62500
[150] eval-rmse:2468394.25000 train-rmse:2005522.25000
[160] eval-rmse:2456246.25000 train-rmse:1990327.50000
[170] eval-rmse:2443469.50000 train-rmse:1977714.75000
[180] eval-rmse:2431326.50000 train-rmse:1967834.50000
[190] eval-rmse:2423893.50000 train-rmse:1958942.25000
[200] eval-rmse:2413961.25000 train-rmse:1948399.00000
[210] eval-rmse:2409204.25000 train-rmse:1930707.25000
[220] eval-rmse:2406664.75000 train-rmse:1920662.00000
[230] eval-rmse:2396993.75000 train-rmse:1911087.87500
[240] eval-rmse:2398797.75000 train-rmse:1901164.00000
[250] eval-rmse:2394962.00000 train-rmse:1892672.00000
[260] eval-rmse:2391643.25000 train-rmse:1884192.37500
[270] eval-rmse:2388217.00000 train-rmse:1877024.25000
[280] eval-rmse:2385589.75000 train-rmse:1867392.25000
[290] eval-rmse:2377887.25000 train-rmse:1861794.37500
[299] eval-rmse:2371819.75000 train-rmse:1854479.00000
# Think about why the two models produced these RMSE results.
from xgboost import plot_importance
# Feature importances for the random-split model:
# full_sq is the most important, with a noticeable drop-off after it —
# a sign the area features could be engineered further.
plot_importance(model_2,max_num_features=15, height=0.9)
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
# Examine prediction errors of the first model on the time-based
# validation set.
# BUG FIX: by this point `xgb_test` has been reassigned to the
# random-split DMatrix, whose rows do not correspond to `val_y` —
# rebuild the matrix from val_X so predictions and true prices align.
xgb_val = xgb.DMatrix(val_X[col_list], feature_names=col_list, enable_categorical=True)
scores = pd.DataFrame(val_y)
scores['predicted'] = model.predict(xgb_val)
scores['error'] = scores.price - scores.predicted
scores
| price | predicted | error | |
|---|---|---|---|
| 2 | 5.826177e+06 | 8111354.50 | -2.285178e+06 |
| 50 | 2.073190e+06 | 5500121.00 | -3.426931e+06 |
| 107 | 4.656580e+06 | 4150533.25 | 5.060468e+05 |
| 144 | 8.098400e+05 | 4242245.00 | -3.432405e+06 |
| 151 | 9.521222e+06 | 5399052.00 | 4.122170e+06 |
| ... | ... | ... | ... |
| 28897 | 5.506912e+06 | 3714926.00 | 1.791986e+06 |
| 28940 | 1.077087e+07 | 5070447.00 | 5.700425e+06 |
| 28948 | 6.990122e+06 | 7993993.00 | -1.003871e+06 |
| 28970 | 7.936432e+06 | 7127760.50 | 8.086715e+05 |
| 28973 | 2.935216e+06 | 5399166.00 | -2.463950e+06 |
1425 rows × 3 columns
scores['error'].describe()
count 1.425000e+03 mean 5.881209e+05 std 5.687290e+06 min -6.665510e+07 25% -1.751808e+06 50% 4.596885e+05 75% 2.738142e+06 max 5.278702e+07 Name: error, dtype: float64
# зная примеры, на которых большие ошибки, можно пробовать тюнить модель